Representing a tree with dictionaries

References:
- https://blog.finxter.com/5-best-ways-to-construct-and-manage-a-tree-in-python/
- https://builtin.com/articles/tree-python (a more complex approach)
- https://bigtree.readthedocs.io/en/0.14.8/ (a package for building trees, but perhaps too complex for our needs)
- Pouly, Marc. "Estimating Text Similarity based on Semantic Concept Embeddings." arXiv preprint arXiv:2401.04422 (2024).

import torch
import einops
import math


from transformers import AutoModel
# Load the Jina AI embeddings model


# Jina v3 ships custom modeling code on the Hub; trust_remote_code=True
# downloads and executes that code locally when the model is loaded.
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)

# Toy taxonomy used to sanity-check the metrics below.
# Convention: a dict value is a category (inner node); a string value is a
# characteristic (leaf).
taxonomy_tree = {
    '1': {
        '2': {
            'A': 'Lake',
            'B': 'River'
        },
        'C': 'House',
        '3': {
            '4': {
                'D': 'Mountain',
                'E': 'Everest',
                'F': 'Volcano'
            }
        }
    }
}


# Collect the leaf (characteristic) nodes of a nested-dict taxonomy.
def get_leaf_nodes(taxonomy):
    """Return {last_path_key: leaf_value} for every non-dict value.

    NOTE(review): only the final path component is used as the key, so a
    leaf key repeated in different branches overwrites earlier entries —
    confirm leaf keys are unique per taxonomy.
    """
    collected = {}

    def walk(node, trail):
        if not isinstance(node, dict):
            collected[trail[-1]] = node  # last path component names the leaf
            return
        for key, child in node.items():
            walk(child, trail + [key])

    walk(taxonomy, [])
    return collected

# Cosine similarity between two texts via the Jina embeddings model.
def calculate_similarity(text1, text2):
    """Embed both texts with the module-level ``model`` and return their
    cosine similarity as a plain Python float."""
    first, second = model.encode([text1, text2])
    cos = torch.nn.functional.cosine_similarity(
        torch.tensor(first), torch.tensor(second), dim=0
    )
    return cos.item()

# Robustness R(T): average per-group score based on intruder counts.
def calculate_r_t(taxonomy):
    """Compute the robustness metric R(T) for a taxonomy.

    Leaves are grouped into consecutive pairs (the last group may be a
    singleton).  For each group, the weakest in-group similarity is
    compared against every outside leaf's similarity to the group's
    first member; outside leaves that beat it count as "intruders".
    Returns the average group score, or 0 for an empty taxonomy.
    """
    leaf_names = list(get_leaf_nodes(taxonomy).values())
    groups = [leaf_names[start:start + 2] for start in range(0, len(leaf_names), 2)]

    scores = []
    for group in groups:
        # All pairwise similarities inside the group.
        pair_sims = [
            calculate_similarity(left, right)
            for idx, left in enumerate(group)
            for right in group[idx + 1:]
        ]
        # Singleton group has no pairs; fall back to 0 as the threshold.
        weakest = min(pair_sims) if pair_sims else 0

        # An outside leaf "intrudes" when it is closer to the group's first
        # member than the weakest in-group pair.
        n_ic = sum(
            1
            for leaf in leaf_names
            if leaf not in group and calculate_similarity(leaf, group[0]) > weakest
        )

        n_gc = len(group)
        n_ac = len(leaf_names)
        capacity = n_gc * (n_ac - n_gc)  # max possible intruder comparisons
        scores.append(1 - n_ic / capacity if capacity > 0 else 0)

    return sum(scores) / len(groups) if groups else 0
  

def extract_ncat(taxonomy):
    """Count intermediate categories.

    Every dict node counts as a category EXCEPT the root dict and the
    first non-root dict encountered (treated as the top-level category).
    Prints the keys of each counted category as a side effect.
    """
    state = {"count": 0, "top_seen": False}

    def walk(node, at_root):
        if not isinstance(node, dict):
            return  # leaves are characteristics, not categories
        if not at_root:
            if state["top_seen"]:
                state["count"] += 1
                print(f"Found category: {list(node.keys())}")
            else:
                state["top_seen"] = True  # skip the first category found
        for child in node.values():
            walk(child, False)

    walk(taxonomy, True)
    return state["count"]



def extract_nchar(taxonomy):
    """Count characteristics, i.e. leaf (non-dict) values in the tree."""

    def leaf_total(node):
        # A dict contributes the sum of its children; anything else is one leaf.
        if isinstance(node, dict):
            return sum(leaf_total(child) for child in node.values())
        return 1

    return leaf_total(taxonomy)

def extract_depths_cat(taxonomy):
    """Pre-order list of depths at which dict (category) nodes occur.

    The root dict is recorded at depth 0.
    """
    recorded = []

    def visit(node, level):
        if not isinstance(node, dict):
            return  # characteristic depths are handled by extract_depths_char
        recorded.append(level)
        for child in node.values():
            visit(child, level + 1)

    visit(taxonomy, 0)
    return recorded
  
  
def extract_depths_char(taxonomy):
    """Pre-order list of depths at which leaf (characteristic) values occur.

    Children of the depth-0 root sit at depth 1.
    """
    recorded = []

    def visit(node, level):
        if not isinstance(node, dict):
            recorded.append(level)
            return
        for child in node.values():
            visit(child, level + 1)

    visit(taxonomy, 0)
    return recorded



import math

def calculate_conciseness(ncat, nchar, depths_cat, depths_char):
    """
    Calculate the conciseness C(T) of the taxonomy.

    C(T) = 1 / (1 + ln(S - 1)), where S is the sum of the inverse depths of
    all categories and characteristics (depth-0 entries are skipped to avoid
    division by zero).

    Parameters:
    ncat (int): The number of categories.
    nchar (int): The number of characteristics.
    depths_cat (list): A list of depths for categories.
    depths_char (list): A list of depths for characteristics.

    Returns:
    float: The conciseness value, or 0 when the formula is undefined:
    S <= 1 puts ln outside its domain (the original `total_sum > 0` guard
    still raised ValueError for 0 < S <= 1), and S == 1 + 1/e makes the
    denominator zero.
    """
    # Sum of inverse depths, skipping zero depths to avoid division by zero.
    sum_cat = sum(1 / d for d in depths_cat if d > 0) if ncat > 0 else 0  # Sum for categories
    sum_char = sum(1 / d for d in depths_char if d > 0) if nchar > 0 else 0  # Sum for characteristics

    total_sum = sum_cat + sum_char

    # math.log requires a strictly positive argument, so total_sum must
    # exceed 1 (not just 0) for the formula to be defined.
    if total_sum <= 1:
        return 0

    denominator = 1 + math.log(total_sum - 1)
    if denominator == 0:
        return 0  # degenerate case: total_sum is exactly 1 + 1/e

    return 1 / denominator

  
# Demo: compute every quality metric for the toy taxonomy_tree.
# The '##' comment lines record the output observed for this tree.
ncat = extract_ncat(taxonomy_tree)
## Found category: ['A', 'B']
## Found category: ['4']
## Found category: ['D', 'E', 'F']
nchar = extract_nchar(taxonomy_tree)
depths_cat = extract_depths_cat(taxonomy_tree)
depths_char = extract_depths_char(taxonomy_tree)

print("Number of categories (ncat):", ncat)
## Number of categories (ncat): 3
print("Number of characteristics (nchar):", nchar)
## Number of characteristics (nchar): 6
print("Depths of categories:", depths_cat)
## Depths of categories: [0, 1, 2, 2, 3]
print("Depths of characteristics:", depths_char)
## Depths of characteristics: [3, 3, 2, 4, 4, 4]
# Calculate R(T) for the given taxonomy
leaves=get_leaf_nodes(taxonomy_tree)
print(leaves)
## {'A': 'Lake', 'B': 'River', 'C': 'House', 'D': 'Mountain', 'E': 'Everest', 'F': 'Volcano'}
robustness_value = calculate_r_t(taxonomy_tree)
print(f"Robustness R(T): {robustness_value:.4f}")
## Robustness R(T): 0.9583
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
## The conciseness of the taxonomy is: 0.45899878671895267

1st paper: A Software Cost Estimation Taxonomy for Global Software Development Projects

new_taxonomy = {
    'Cost estimation for GSD': {
        'Cost estimation context': {
            'Planning': {
                "Conceptualization": "Conceptualization",
                "Feasibility study": "Feasibility study",
                "Preliminary planning": "Preliminary planning",
                "Detail Planning": "Detail planning",
                "Execution": "Execution",
                "Commissioning": "Commissioning"
            },
            'Project activities': {
                "System investigation": "System investigation",
                "Analysis": "Analysis",
                "Design": "Design",
                "Implementation": "Implementation",
                "Testing": "Testing",
                "Maintenance": "Maintenance",
                "Other Project Activities": "Project Activities.Other"
            },
            'Project domain': {
                "SE": "Systems Engineering",
                "Research & Dev": {
                    "Telecommunication": "Telecommunication"
                },
                "Finance": "Finance",
                "Healthcare": "Healthcare",
                "Other Project Domain": "Project Domain.Other"
            },
            'Project setting': {
                "Close onshore": "Close onshore",
                "Distant onshore": "Distant onshore",
                "Near offshore": "Near offshore",
                "Far offshore": "Far offshore"
            },
            'Planning approaches': {
                "Constructive Cost Model": "Constructive Cost Model",
                "Capability Maturity Model Integration": "Capability Maturity Model Integration",
                "Agile": "Agile",
                "Delphi": "Delphi",
                "GA": "Genetic Algorithms",
                "CBR": "Case-Based Reasoning",
                "Fuzzy similar": "Fuzzy similar",
                "Other planning approaches": "Planning Approaches.other"
            },
            'Number of sites': {
                "Value of number of sites": "Number of sites.Value"
            },
            'Team size': {
                "No of team members": "Number of team members"
            }
        },
        'Estimation technique': {
            'Estimation technique': {
                "Expert judgment": "Expert judgment",
                "Machine learning": "Machine learning",
                "Non-machine learning": "Non-machine learning"
            },
            'Use technique': {
                "Individual": "Individual",
                "Group-based estimation": "Group-based estimation"
            }
        },
        'Cost estimate': {
            'Estimated cost': {
                "Estimate value": "Estimated value"
            },
            'Actual cost': {
                "Value": "Actual cost.Value"
            },
            'Estimation dimension': {
                "Effort hours": "Effort hours",
                "Staff/cost": "Staff/cost",
                "Hardware": "Hardware",
                "Risk": "Risk",
                "Portfolio": "Portfolio"
            },
            'Accuracy measure': {
                "Baseline comparison": "Baseline comparison",
                "Variation reduction": "Variation reduction",
                "Sensitivity analysis": "Sensitivity analysis"
            }
        },
        'Cost estimators': {
            'Product size': {
                "Size report": "Size report",
                "Statistics analysis": "Statistics analysis"
            },
            'Team experience': {
                "Considered": "Team experience.Considered",
                "Not considered": "Team experience.Not considered"
            },
            'Team structure': {
                "Considered": "Team structure.Considered",
                "Not Considered": "Team structure.Not considered"
            },
            'Product requirement': {
                "Performance": "Performance",
                "Security": "Security",
                "Availability": "Availability",
                "Reliability": "Reliability",
                "Maintainability": "Maintainability",
                "Other requirement": "Producte requirement.Other"
            },
            'Distributed teams distances': {
                "Geographical distance": "Geographical distance",
                "Temporal distance": "Temporal distance",
                "Socio-cultural distance": "Socio-cultural distance"
            }
        }
    }
}

# Metrics for the Bajta GSD cost-estimation taxonomy.  The alias keeps a
# handle on this dict before new_taxonomy is rebound by the next section.
bajta_tax = new_taxonomy
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')

2nd paper: A Taxonomy of Web Effort Predictors

new_taxonomy = {
    'Web Predictor': {
        'Size Metric': {
            'Length': {
                        'Web page count': 'Web page count',
                        'Media count': 'Media count',
                        'New media count': 'New media count',
                        'New Web page count': 'New Web page count',
                        'Link count': 'Link count',
                        'Program count': 'Program count',
                        'Reused component count': 'Reused component count',
                        'Lines of code': 'Lines of code',
                        'Reused program count': 'Reused program count',
                        'Reused media count': 'Reused media count',
                        'Web page allocation': 'Web page allocation',
                        'Reused lines of code': 'Reused lines of code',
                        'Media allocation': 'Media allocation',
                        'Reused media allocation': 'Reused media allocation',
                        'Entity count': 'Entity count',
                        'Attribute count': 'Attribute count',
                        'Component count': 'Component count',
                        'Statement count': 'Statement count',
                        'Node count': 'Node count',
                        'Collection slot size': 'Collection slot size',
                        'Component granularity level': 'Component granularity level',
                        'Slot granularity level': 'Slot granularity level',
                        'Model node size': 'Model node size',
                        'Cluster node size': 'Cluster node size',
                        'Node slot size': 'Node slot size',
                        'Publishing model unit count': 'Publishing model unit count',
                        'Model slot size': 'Model slot size',
                        'Association slot size': 'Association slot size',
                        'Client script count': 'Client script count',
                        'Server script count': 'Server script count',
                        'Information slot count': 'Information slot count',
                        'Association center slot count': 'Association center slot count',
                        'Collection center slot count': 'Collection center slot count',
                        'Component slot count': 'Component slot count',
                        'Semantic association count': 'Semantic association count',
                        'Segment count': 'Segment count',
                        'Slot count': 'Slot count',
                        'Cluster slot count': 'Cluster slot count',
                        'Cluster count': 'Cluster count',
                        'Publishing unit count': 'Publishing unit count',
                        'Section count': 'Section count',
                        'Inner/sub concern count': 'Inner/sub concern count',
                        'Indifferent concern count': 'Indifferent concern count',
                        'Module point cut count': 'Module point cut count',
                        'Module count': 'Module count',
                        'Module attribute count': 'Module attribute count',
                        'Operation count': 'Operation count',
                        'Comment count': 'Comment count',
                        'Reused comment count': 'Reused comment count',
                        'Media duration': 'Media duration',
                        'Diffusion cut count': 'Diffusion cut count',
                        'Concern module count': 'Concern module count',
                        'Concern operation count': 'Concern operation count',
                        'Anchor count': 'Anchor count'},
            'Functionality': {
                        'High feature count': 'High feature count',
                        'Low feature count': 'Low feature count',
                        'Reused high feature count': 'Reused high feature count',
                        'Reused low feature count': 'Reused low feature count',
                        'Web objects': 'Web objects',
                        'Common Software Measurement International Consortium': 'Common Software Measurement International Consortium',
                        'International Function Point Users Group': 'International Function Point Users Group',
                        'Object-Oriented Heuristic Function Points': 'Object-Oriented Heuristic Function Points',
                        'Object-Oriented Function Points': 'Object-Oriented Function Points',
                        'Use case count': 'Use case count',
                        'Feature count': 'Feature count',
                        'Data Web points': 'Data Web points'},
            
            'Object-oriented': {
                        'Cohesion': 'Cohesion',
                        'Class coupling': 'Class coupling',
                        'Concern coupling': 'Concern coupling'}, 

            'Complexity': {
                        'Connectivity density': 'Connectivity density',
                        'Cyclomatic complexity': 'Cyclomatic complexity',
                        'Model collection complexity': 'Model collection complexity',
                        'Model association complexity': 'Model association complexity',
                        'Model link complexity': 'Model link complexity',
                        'Page complexity': 'Page complexity',
                        'Component complexity': 'Component complexity',
                        'Total complexity': 'Total complexity',
                        'Adaptation complexity': 'Adaptation complexity',
                        'New complexity': 'New complexity',
                        'Data usage complexity': 'Data usage complexity',
                        'Data flow complexity': 'Data flow complexity',
                        'Cohesion complexity': 'Cohesion complexity',
                        'Interface complexity': 'Interface complexity',
                        'Control flow complexity': 'Control flow complexity',
                        'Class complexity': 'Class complexity',
                        'Layout complexity': 'Layout complexity',
                        'Input complexity': 'Input complexity',
                        'Output complexity': 'Output complexity'} 
                        },
        'Cost Driver': {
          'Product':{
            'Type of product': 'Product.Type',
            'Stratum': 'Stratum',
            'Compactness': 'Compactness',
            'Structure': 'Structure',
            'Architecture': 'Architecture',
            'Integration with legacy systems': 'Integration with legacy systems',
            'Concurrency level': 'Concurrency level',
            'Processing requirements': 'Processing requirements',
            'Database size': 'Database size',
            'Requirements volatility level': 'Requirements volatility level',
            'Requirements novelty level': 'Requirements novelty level',
            'Reliability level': 'Reliability level',
            'Maintainability level': 'Maintainability level',
            'Time efficiency level': 'Time efficiency level',
            'Memory efficiency level': 'Memory efficiency level',
            'Portability level': 'Portability level',
            'Scalability level': 'Scalability level',
            'Quality level': 'Quality level',
            'Usability level': 'Usability level',
            'Readability level': 'Readability level',
            'Security level': 'Security level',
            'Installability level': 'Installability level',
            'Modularity level': 'Modularity level',
            'Flexibility level': 'Flexibility level',
            'Testability level': 'Testability level',
            'Accessibility level': 'Accessibility level',
            'Trainability level': 'Trainability level',
            'Innovation level': 'Innovation level',
            'Technical factors': 'Technical factors',
            'Storage constraint': 'Storage constraint',
            'Reusability level': 'Reusability level',
            'Robustness level': 'Robustness level',
            'Design volatility': 'Design volatility',
            'Experience level': 'Experience level',
            'Requirements clarity level': 'Requirements clarity level'},
        'Client': {
            'Availability level': 'Availability level',
            'IT literacy': 'IT literacy',
            'Mapped workflows': 'Mapped workflows',
            'Personality of client': 'Client.Personality'},
            
        'Development Company': {
            'SPI program': 'SPI program',
            'Metrics’ program': 'Metrics’ program',
            'Number of projects in parallel': 'Number of projects in parallel',
            'Software reuse': 'Software reuse'},
        'Project': {
            'Documentation level': 'Documentation level',
            'Number of programming languages': 'Number of programming languages',
            'Type of project': 'Project.Type',
            'Process efficiency level': 'Process efficiency level',
            'Project management level': 'Project management level',
            'Infrastructure': 'Infrastructure',
            'Development restriction': 'Development restriction',
            'Time restriction': 'Time restriction',
            'Risk level': 'Risk level',
            'Rapid app development': 'Rapid app development',
            'Operational mode': 'Operational mode',
            'Resource level': 'Resource level',
            'Lessons learned repository': 'Lessons learned repository'},            
        'Team': {
            'Domain experience level': 'Domain experience level',
            'Team size': 'Team size',
            'Deployment platform experience level': 'Deployment platform experience level',
            'Team capability': 'Team capability',
            'Programming language experience level': 'Programming language experience level',
            'Tool experience level': 'Tool experience level',
            'Communication level': 'Communication level',
            'Software development experience': 'Software development experience',
            'Work Team level': 'Work Team level',
            'Stability level': 'Stability level',
            'Motivation level': 'Motivation level',
            'Focus factor': 'Focus factor',
            'Tool experience level': 'Tool experience level',
            'OO experience level': 'OO experience level',
            'In-house experience': 'In-house experience'},
        'Technology': {
            'Authoring tool type': 'Authoring tool type',
            'Productivity level': 'Productivity level',
            'Novelty level': 'Novelty level',
            'Platform volatility level': 'Platform volatility level',
            'Difficulty level': 'Difficulty level',
            'Platform support level': 'Platform support level'}}
          
}
}

# Metrics for the Britto web-effort-predictors taxonomy.  The alias keeps a
# handle on this dict before new_taxonomy is rebound by the next section.
britto1_tax=new_taxonomy
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')

3rd paper: A Specialized Global Software Engineering Taxonomy for Effort Estimation


# Taxonomy from the specialized GSE effort-estimation paper.
# NOTE(review): 'Site' and 'Relationship' repeat leaf keys such as
# "Location" and "Legal Entity"; get_leaf_nodes() keys leaves by their last
# path component, so duplicates collapse to a single entry there, while
# extract_nchar() counts every occurrence — confirm this mismatch is intended.
new_taxonomy = {
    'GSE': {
        'Project': {
            'Site': {
                "Location": "Location",
                "Legal Entity": "Legal Entity",
                "Geographic Distance": "Geographic Distance",
                "Temporal Distance": "Temporal Distance",
                "Estimation stage": {
                    "Early Estimation stage": "Estimation stage.Early",
                    "Early & Late Estimation stage": "Estimation stage.Early & Late",
                    "Late Estimation stage": "Estimation stage.Late"
                },
                "Estimation process role": {
                    "Estimator": "Estimator",
                    "Estimator & Provider": "Estimator & Provider",
                    "Provider": "Provider"
                }
            },
            'Relationship': {
                "Location": "Location",
                "Legal Entity": "Legal Entity",
                "Geographic Distance": "Geographic Distance",
                "Temporal Distance": "Temporal Distance",
                "Estimation process architectural model": {
                    "Centralized": "Centralized",
                    "Distributed": "Distributed",
                    "Semi-distributed": "Semi-distributed"
                }
            }
        }
    }
}

# Metrics for the specialized GSE effort-estimation taxonomy.  The alias
# keeps a handle on this dict before new_taxonomy is rebound below.
britto2_tax=new_taxonomy
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')

4th paper: A Taxonomy of Approaches and Methods for Software Effort Estimation

# Taxonomy from the approaches-and-methods effort-estimation paper.
# NOTE(review): "Legal Entity" under 'Combined Estimating Methods' looks like
# a copy-paste from the previous (GSE) taxonomy — verify against the paper.
new_taxonomy = {
    'Software estimation': {
        'Basic Estimating Methods': {
            "Algorithmic": {
                "Constructive Cost Model": "Constructive Cost Model",
                "Software Life Cycle Management": "Software Life Cycle Management",
                "Software Evaluation and Estimation for Risk": "Software Evaluation and Estimation for Risk"
            },
            "Non-Algorithmic": {
                "Expert Judgment": "Expert Judgment",  # Corrected spelling
                "Analogy-Based": "Analogy-Based"
            }
        },
        'Combined Estimating Methods': {
            "Basic-Combination": "Basic-Combination",
            "Legal Entity": "Legal Entity",
            "Estimation process architectural model": {
                "Fuzzy Logic": "Fuzzy Logic",
                "Artificial Neural Networks": "Artificial Neural Networks",
                "Computational Intelligence": {  # Corrected spelling
                    "swarm": "swarm",
                    "evolutionary": "evolutionary"
                }
            },
            "AI-Combined hybrid": "AI-Combined hybrid"
        }
    }
}

# Metrics for the approaches-and-methods effort-estimation taxonomy.  The
# alias keeps a handle on this dict before new_taxonomy is rebound below.
dashti_tax=new_taxonomy
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')

5th paper: Towards a Taxonomy of Hypermedia and Web Application Size Metrics


# Taxonomy from the hypermedia/Web-application size-metrics paper.
new_taxonomy = {
  "Hypermedia and Web Application Size Metrics":{
    "Motivation":{"Motivation":"Motivation"},
    "Harvesting time":{
      "Early":"Early size metric",
      "Late":"Late size metric"},
    "Metric foundation":{
      "Problem-oriented metric":"Problem-oriented metric",
      "Solution-oriented metric":"Solution-oriented metric"},
    "Class":{
      "Length":"Length",
      "Functionality":"Functionality",
      "Complexity":"Complexity"},
    "Entity":{
      "Web hypermedia application":"Web hypermedia application",
      "Web software application":"Web software application",
      "Web application":"Web application",
      "Media":"Media",
      # BUG FIX: value was misspelled "Program/Sript"; leaf strings are fed
      # to the embedding model, so the typo skewed the similarity scores.
      "Program/Script":"Program/Script"},
    "Measurement Scale":{
      "Nominal":"Nominal",
      "Ordinal":"Ordinal",
      "Interval":"Interval",
      "Ratio":"Ratio",
      "Absolute":"Absolute"},
    "Computation":{
      "Direct":"Direct",
      "Indirect":"Indirect"},
    "Validation":{
      "Validated Empirically":"Validated Empirically",
      "Validated Theoretically":"Validated Theoretically",
      "Both Empirically and Theoretically":"Validation.Both",
      "No Validation":"Validation.None"},
    "Model dependency":{
      "Specific":"Specific",
      "Nonspecific":"Nonspecific"}
}
}

# Metrics for the hypermedia/Web-application size-metrics taxonomy.  The
# alias keeps a handle on this dict before new_taxonomy is rebound below.
mendes_tax=new_taxonomy
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')

6th paper: An Effort Estimation Taxonomy for Agile Software Development

# Taxonomy from Usman et al., "An Effort Estimation Taxonomy for Agile Software
# Development". Inner dicts are categories; string values are characteristics
# (leaf nodes) that get embedded and compared downstream.
# Transcription typos fixed (they would otherwise skew the embedding similarity):
#   "estimateed"           -> "estimated"
#   "Project somain.Other" -> "Project domain.Other"
#   "User case points"     -> "Use case points"
#   "House/days"           -> "Hours/days"   (unit of effort — confirm against the paper)
new_taxonomy = {
    'Effort Estimation in ASD': {
        'Estimation context': {
            "Planning level": {
                "Release Planning level": "Planning level.Release",
                "Sprint Planning level": "Planning level.Sprint",
                "Daily Planning level": "Planning level.Daily",
                "Bidding Planning level": "Planning level.Bidding"
            },
            "Estimated activities": {
                "Analysis": "Analysis",
                "Design": "Design",
                "Implementation": "Implementation",
                "Testing": "Testing",
                "Maintenance": "Maintenance",
                "All estimated activities": "Estimated activities.All"
            },
            "Agile methods": {
                "Extreme Programming": "Extreme Programming",
                "Scrum": "Scrum",
                "Customized Extreme Programming": "Customized Extreme Programming",
                "Customized Scrum": "Customized Scrum",
                "Dynamic Systems Development Method": "Dynamic Systems Development Method",
                "Crystal": "Crystal",
                "Feature-Driven Development": "Feature-Driven Development",
                "Kanban": "Kanban"
            },
            "Project domain": {
                "Communications industry": "Communications industry",
                "Transportation": "Transportation",
                "Financial": "Financial",
                "Education": "Education",
                "Health": "Health",
                "Retail/Wholesale": "Retail/Wholesale",
                "Manufacturing": "Manufacturing",
                "Government/Military": "Government/Military",
                "Other project domain": "Project domain.Other"
            },
            "Project setting": {
                "Co-located Project setting": "Project setting.Co-located",
                "Distributed: Close Onshore": "Distributed: Close Onshore",
                "Distributed: Distant Onshore": "Distributed: Distant Onshore",
                "Distributed: Near Offshore": "Distributed: Near Offshore",
                "Distributed: Far Offshore": "Distributed: Far Offshore"
            },
            "Estimation entity": {
                "User story Estimation entity": "User story",
                "Task Estimation entity": "Task",
                "Use case Estimation entity": "Use case",
                "Other Estimation entity": "Estimation entity.Other"
            },
            "Number of entities estimated": {
                "Number of entities estimated": "Number of entities estimated"
            },
            "Team size": {
                "No. of team members": "Team size.Value"
            }
        },
        'Estimation technique': {
            "Estimation Techniques": {
                "Planning Poker": "Planning Poker",
                "Expert Judgement": "Expert Judgement",
                "Analogy": "Analogy",
                "Use case points method": "Use case points method",
                "Other estimation technique": "Estimation technique.Other"
            },
            "Type": {
                "Single type": "Type.Single",
                "Group type": "Type.Group"
            }
        },
        'Effort predictors': {
            "Size": {
                "Story points": "Story points",
                "Use case points": "Use case points",
                "Function points": "Function points",
                "Other Effort predictors": "Other Effort predictors",
                "Not used Effort predictors": "Not used Effort predictors",
                "Considered without any metric": "Considered without any metric"
            },
            "Team's prior experience": {
                "Considered Team's prior experience": "Team's prior experience.Considered",
                "Not Considered Team's prior experience": "Team's prior experience.Not Considered"
            },
            "Team's skill level": {
                "Considered Team's skill level": "Team's skill level.Considered",
                "Not Considered Team's skill level": "Team's skill level.Not Considered"
            },
            "Non functional requirements": {
                "Performance": "Performance",
                "Security": "Security",
                "Availability": "Availability",
                "Reliability": "Reliability",
                "Maintainability": "Maintainability",
                "Other Non functional requirements": "Non functional requirements.Other",
                "Not considered Non functional requirements": "Non functional requirements.Not considered"
            },
            "Distributed teams' issues": {
                "Considered Distributed teams": "Distributed teams.Considered",
                "Not Considered Distributed teams": "Distributed teams.Not Considered",
                "Not applicable Distributed teams": "Distributed teams.Not applicable"
            },
            "Customer Communication": {
                "Considered Customer Communication": "Customer Communication.Considered",
                "Not Considered Customer Communication": "Customer Communication.Not Considered"
            }
        },
        'Effort estimate': {
            "Estimated effort": {
                "Estimate value(s)": "Estimate value(s)"
            },
            "Actual effort": {
                "Actual effort Value": "Actual effort.Value"
            },
            "Type": {
                "Point Type": "Point Type",
                "Three point Type": "Three point Type",
                "Distribution Type": "Distribution Type",
                "Other Type": "Other Type"
            },
            "Unit": {
                "Hours/days": "Hours/days",
                "Pair days": "Pair/days",
                "Ideal hours": "Ideal hours",
                "Other Unit": "Unit.Other"
            },
            "Accuracy Level": {
                "Accuracy Level Value": "Accuracy Level.Value"
            },
            "Accuracy measure": {
                "Mean Magnitude of Relative Error": "Mean Magnitude of Relative Error",
                "Median Magnitude of Relative Error": "Median Magnitude of Relative Error",
                "Bias of Relative Error": "Bias of Relative Error",
                "Other Accuracy measure": "Accuracy measure.Other",
                "Not used Accuracy measure": "Accuracy measure.Not used"
            }
        }
    }
}

# Keep an author-named alias for the cross-taxonomy comparison further below.
usman_tax=new_taxonomy
# Leaf nodes (characteristics) of the Usman taxonomy.
leaves = get_leaf_nodes(new_taxonomy)
print(leaves)

# Structural metrics (helper functions are defined earlier in the file).
ncat = extract_ncat(new_taxonomy)
nchar = extract_nchar(new_taxonomy)
depths_cat = extract_depths_cat(new_taxonomy)
depths_char = extract_depths_char(new_taxonomy)

print("Number of categories (ncat):", ncat)
print("Number of characteristics (nchar):", nchar)
print("Depths of categories:", depths_cat)
print("Depths of characteristics:", depths_char)

robustness_value = calculate_r_t(new_taxonomy)
print(f"Robustness R(T): {robustness_value:.4f}")
conciseness= calculate_conciseness(ncat, nchar, depths_cat, depths_char)
print(f'The conciseness of the taxonomy is: {conciseness}')
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # Import for 3D plotting
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib
plt.clf()  # start from a clean current figure

plt.style.use('seaborn-v0_8-whitegrid')  # You can change this to any available style

plt.rcParams['font.family'] = 'serif'

# Extracting categories sets
def extract_intermediate_elements(taxonomy, result=None):
    """Collect the names of every non-leaf (category) node of a nested-dict taxonomy.

    A key counts as a category when its value is itself a dict. An existing set
    may be passed via `result` to accumulate into; otherwise a fresh one is built.
    """
    collected = set() if result is None else result
    for name, child in taxonomy.items():
        if isinstance(child, dict):
            collected.add(name)
            extract_intermediate_elements(child, collected)
    return collected

# Category (intermediate-node) name sets for each source taxonomy defined
# earlier in the file; the '##' lines below are captured interpreter output.
bajta_tax_categories = extract_intermediate_elements(bajta_tax)
britto1_tax_categories = extract_intermediate_elements(britto1_tax)
britto2_tax_categories = extract_intermediate_elements(britto2_tax)
dashti_tax_categories = extract_intermediate_elements(dashti_tax)
mendes_tax_categories = extract_intermediate_elements(mendes_tax)
usman_tax_categories = extract_intermediate_elements(usman_tax)

print({'Bajta': bajta_tax_categories})
## {'Bajta': {'Estimation dimension', 'Team experience', 'Research & Dev', 'Accuracy measure', 'Project setting', 'Cost estimate', 'Estimated cost', 'Actual cost', 'Cost estimators', 'Planning approaches', 'Planning', 'Distributed teams distances', 'Product requirement', 'Team size', 'Team structure', 'Project domain', 'Project activities', 'Estimation technique', 'Cost estimation context', 'Cost estimation for GSD', 'Use technique', 'Product size', 'Number of sites'}}
print({'Britto_2017':britto1_tax_categories})
## {'Britto_2017': {'Length', 'Development Company', 'Client', 'Cost Driver', 'Web Predictor', 'Team', 'Object-oriented', 'Technology', 'Complexity', 'Product', 'Size Metric', 'Functionality', 'Project'}}
print({'Britto_2016':britto2_tax_categories})
## {'Britto_2016': {'Estimation process role', 'GSE', 'Estimation process architectural model', 'Site', 'Relationship', 'Estimation stage', 'Project'}}
print({'Dashti':dashti_tax_categories})
## {'Dashti': {'Estimation process architectural model', 'Software estimation', 'Combined Estimating Methods', 'Basic Estimating Methods', 'Computational Intelligence', 'Non-Algorithmic', 'Algorithmic'}}
print({'Mendes':mendes_tax_categories})
## {'Mendes': {'Hypermedia and Web Application Size Metrics', 'Harvesting time', 'Metric foundation', 'Measurement Scale', 'Motivation', 'Class', 'Model dependency', 'Validation', 'Computation', 'Entity'}}
print({'Usman':usman_tax_categories})
## {'Usman': {'Estimated activities', 'Planning level', 'Non functional requirements', 'Estimation context', 'Accuracy Level', 'Accuracy measure', 'Estimation Techniques', 'Effort predictors', 'Number of entities estimated', 'Project setting', 'Estimated effort', "Team's prior experience", 'Estimation entity', 'Actual effort', 'Agile methods', 'Unit', 'Effort estimate', 'Team size', 'Project domain', "Distributed teams' issues", "Team's skill level", 'Estimation technique', 'Size', 'Customer Communication', 'Type', 'Effort Estimation in ASD'}}
# One set of category names per paper; consumed by the visualization sections.
sets = {
    'Bajta': bajta_tax_categories,
    'Britto_2017': britto1_tax_categories,
    'Britto_2016': britto2_tax_categories,
    'Dashti': dashti_tax_categories,
    'Mendes': mendes_tax_categories,
    'Usman': usman_tax_categories
}


# Extract characteristics (Execute only one of the two, or characteristics set or categories set)

def extract_leaf_elements(nested_dict):
    """Recursively collect the leaf values of a nested dictionary.

    A leaf is any non-dict value; keys are ignored. Returns a set, so
    duplicate leaf values collapse to one entry.
    """
    found = set()
    for child in nested_dict.values():
        if isinstance(child, dict):
            found |= extract_leaf_elements(child)
        else:
            found.add(child)
    return found

# Example usage with your taxonomies
# Characteristic (leaf-value) sets for each source taxonomy.
bajta_tax_leaves = extract_leaf_elements(bajta_tax)
britto1_tax_leaves = extract_leaf_elements(britto1_tax)
britto2_tax_leaves = extract_leaf_elements(britto2_tax)
dashti_tax_leaves = extract_leaf_elements(dashti_tax)
mendes_tax_leaves = extract_leaf_elements(mendes_tax)
usman_tax_leaves = extract_leaf_elements(usman_tax)

# NOTE: this rebinds `sets` (previously the category sets) to the leaf sets —
# execute only the variant you want to visualize downstream.
sets = {
    'Bajta': bajta_tax_leaves,
    'Britto_2017': britto1_tax_leaves,
    'Britto_2016': britto2_tax_leaves,
    'Dashti': dashti_tax_leaves,
    'Mendes': mendes_tax_leaves,
    'Usman': usman_tax_leaves
}



# Output the final dictionary

# Step 2: Flatten the sets into a dataframe (assuming 'sets' is already defined)
words = []
labels = []
for label, words_set in sets.items():
    for word in words_set:
        words.append(word)
        labels.append(label)

# Create a dataframe of every word together with the set (paper) it came from
df = pd.DataFrame({'Word': words, 'Set': labels})

# Step 3: Load the pre-trained model and tokenizer
model_name = "jinaai/jina-embeddings-v3"
# Lazy-load guard: skip the slow download when the interactive session already
# holds a model/tokenizer in its namespace.
if 'model' not in locals() or 'tokenizer' not in locals():
    print("Loading model and tokenizer...")
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
    print("Model and tokenizer are already loaded.")
## Loading model and tokenizer...
# Step 4: Get the embeddings for each word
def get_embeddings(word):
    """Embed one word/phrase: tokenize, run the model without gradients, mean-pool the tokens."""
    token_batch = tokenizer(word, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        model_out = model(**token_batch)
    return model_out.last_hidden_state.mean(dim=1).squeeze().numpy()

embeddings = np.array([get_embeddings(word) for word in df['Word']])  # one row per word

# Step 5: Perform t-SNE (now in 2D)
# NOTE(review): scikit-learn requires perplexity < n_samples — confirm the word
# list is large enough for perplexity=30.
tsne = TSNE(n_components=2, perplexity=30, random_state=5)
embeddings_2d = tsne.fit_transform(embeddings)
## C:\Users\mysit\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\backend\context.py:136: UserWarning: Could not find the number of physical cores for the following reason:
## found 0 physical cores < 1
## Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.
##   warnings.warn(
##   File "C:\Users\mysit\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
##     raise ValueError(f"found {cpu_count_physical} physical cores < 1")
# Step 6: Convert string labels to numeric labels for coloring
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit_transform(labels)

# Step 7: Create the 2D scatter plot
fig, ax = plt.subplots(figsize=(10, 7))

# Plot the 2D scatter with the numeric labels for colors
scatter = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                     c=numeric_labels, cmap='Set1', s=100)

# Annotate each point with the word
for i, word in enumerate(df['Word']):
    ax.text(embeddings_2d[i, 0] + 0.1, embeddings_2d[i, 1] + 0.1, word, fontsize=9)

# Step 8: Add labels and title
ax.set_title("2D t-SNE Visualization of Word Embeddings")
ax.set_xlabel("t-SNE Dimension 1")
ax.set_ylabel("t-SNE Dimension 2")

# Step 9: Move the legend outside of the plot
legend_labels = label_encoder.classes_
# BUG FIX: the legend previously sampled plt.cm.Set2 while the scatter was drawn
# with cmap='Set1', so legend swatches did not match the plotted points. Reuse
# the scatter's own colormap and normalization so the two always agree.
handles = [plt.Line2D([0], [0], marker='o', color='w',
                      markerfacecolor=scatter.cmap(scatter.norm(i)), markersize=5)
           for i in range(len(legend_labels))]
ax.legend(handles, legend_labels, title="Set", loc="center left", bbox_to_anchor=(1.05, 0.5), borderaxespad=0.)

# Step 10: Show the plot
plt.tight_layout()  # Ensures proper spacing with the legend outside
plt.savefig('word_embeddings.png', dpi=300, bbox_inches='tight')
plt.show()

K-means Plot

import random
import umap.umap_ as umap
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib.lines import Line2D  # Add this import at the top of your code
colorstyle = "Set2"  # colormap shared by the scatter points and the cluster hulls
seed=5
marker_styles = ['o', '^', 's', 'p', '*', 'D']
# Seed every RNG in play so the UMAP / K-means figure is reproducible
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
## <torch._C.Generator object at 0x000000007FA84030>
plt.clf()  # start from a clean current figure

plt.style.use('seaborn-v0_8-whitegrid')  # You can change this to any available style

plt.rcParams['font.family'] = 'serif'


# Step 2: Flatten the sets into a dataframe
words = []
labels = []
for label, words_set in sets.items():
    for word in words_set:
        words.append(word.lower())  # words are lower-cased here, unlike the t-SNE section
        labels.append(label)

# Create a dataframe of every word together with the set (paper) it came from
df = pd.DataFrame({'Word': words, 'Set': labels})

# Step 3: Load the pre-trained model and tokenizer
model_name = "jinaai/jina-embeddings-v3"
# Lazy-load guard: reuse the session's model/tokenizer if already present
if 'model' not in locals() or 'tokenizer' not in locals():
    print("Loading model and tokenizer...")
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
    print("Model and tokenizer are already loaded.")
## Model and tokenizer are already loaded.
# Step 4: Get the embeddings for each word
def get_embeddings(word):
    """Return the mean-pooled last-hidden-state vector for `word` as a numpy array."""
    enc = tokenizer(word, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        hidden = model(**enc).last_hidden_state
    return hidden.mean(dim=1).squeeze().numpy()

embeddings = np.array([get_embeddings(word) for word in df['Word']])  # one row per word

# Step 5: Perform 2D UMAP
umap_model = umap.UMAP(n_components=2, random_state=5)
embeddings_2d = umap_model.fit_transform(embeddings)
# Step 6: Create a color map that reflects the set labels
unique_labels = list(df['Set'].unique())  # Get the unique set labels
# FIX: plt.cm.get_cmap(name, lut) was deprecated in Matplotlib 3.7 (this cell
# used to emit the warning); the documented replacement is
# matplotlib.colormaps[name].resampled(lut), which yields an equivalent colormap.
cmap = matplotlib.colormaps[colorstyle].resampled(len(unique_labels))
# Step 7: Run K-means on UMAP embeddings
num_clusters = len(unique_labels)  # Set number of clusters to match unique labels
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=5)
kmeans_labels = kmeans.fit_predict(embeddings_2d)

# Step 8: Generate the top-N closest words as a name for each cluster
top_n = 3  # How many top words to display for each cluster
cluster_names = []

for i in range(num_clusters):
    # Get the embeddings for words in the current cluster
    cluster_indices = np.where(kmeans_labels == i)[0]
    cluster_embeddings = embeddings[cluster_indices]

    # Calculate the centroid of the cluster (in the original embedding space)
    cluster_centroid = np.mean(cluster_embeddings, axis=0).reshape(1, -1)

    # Cosine similarity of the centroid to ALL words' embeddings (not only this
    # cluster's members), to find the globally closest words
    similarities = cosine_similarity(cluster_centroid, embeddings).flatten()

    # Indices of the top_n closest words, most similar first
    closest_word_indices = np.argsort(similarities)[-top_n:][::-1]

    # Get the words corresponding to these indices
    closest_words = df['Word'].iloc[closest_word_indices].tolist()

    # Store the top_n closest words as the cluster name
    cluster_names.append(closest_words)

# Step 9: Plot with translucent shapes for each K-means cluster and annotate
# with the top cluster names.
# BUG FIX: plt.figure() was called twice here, which opened a stray empty
# figure before the real one; create the figure exactly once.
plt.figure(figsize=(10, 7))
color_map = {label: cmap(i) for i, label in enumerate(unique_labels)}

# Marker styles, one per label (cycled if there are more labels than styles)
marker_styles = ['o', '^', 's', 'p', '*', 'D']

# Loop through each label and plot with the corresponding marker style
for i, label in enumerate(unique_labels):
    # Rows belonging to the current label
    label_data = df[df['Set'] == label]

    # Plot with a different marker for each label
    plt.scatter(embeddings_2d[df['Set'] == label, 0],
                embeddings_2d[df['Set'] == label, 1],
                c=[color_map[label]] * len(label_data),
                s=80,
                label=label,
                marker=marker_styles[i % len(marker_styles)], alpha=0.6)  # modulo cycles marker styles


# Draw convex hulls around each cluster and annotate with cluster names
for i in range(num_clusters):
    cluster_points = embeddings_2d[kmeans_labels == i]

    if len(cluster_points) >= 3:  # ConvexHull requires at least 3 points
        hull = ConvexHull(cluster_points)
        hull_points = cluster_points[hull.vertices]
        plt.fill(hull_points[:, 0], hull_points[:, 1], alpha=0.2, 
                 color=cmap(i), label=f'Cluster {i+1}')

    # Annotate with the top_n cluster names at the centroid location
    cluster_centroid_2d = np.mean(cluster_points, axis=0)
    # Stack the top_n words on separate lines, upper-cased, for cleaner display
    cluster_name_text = '\n'.join(cluster_names[i]).upper() 

    # Annotate with the top words at the centroid, with slightly smaller font size
    plt.text(cluster_centroid_2d[0], cluster_centroid_2d[1], cluster_name_text, 
             fontsize=8, ha='center', color='black')

# Step 10: Custom legend to show colors and shapes for each label
plt.title("2D UMAP Visualization of Word Embeddings with K-means Clusters")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")

# Hand-built legend: one proxy marker per label, matching the scatter's
# color_map and marker cycle above
legend_elements = [Line2D([0], [0], marker=marker_styles[i % len(marker_styles)], color='w', 
                          markerfacecolor=color_map[label], markersize=10, label=label)
                   for i, label in enumerate(unique_labels)]
plt.legend(
    handles=legend_elements,
    title="Literature",
    loc="lower center",
    bbox_to_anchor=(0.5, -0.2),  # Position it just below the plot
    ncol=len(unique_labels),      # Arrange legend items in a single row
    frameon=False                 # Optional: Remove legend box frame
)
# Adjust layout to ensure the legend is not clipped
plt.tight_layout()

# Step 11: Save the plot in high resolution
plt.savefig('word_embeddings_kmeans.png', dpi=600, bbox_inches='tight')

# Show the plot
plt.show()

import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

plt.clf()  # start from a clean current figure

plt.style.use('seaborn-v0_8-whitegrid')  # You can change this to any available style

plt.rcParams['font.family'] = 'serif'

# Define the sets of words

# Combine all sets into a single list with labels
word_sets = sets

# Lower-case every word; NOTE: entries differing only by case collapse into one
word_sets = {label: {word.lower() for word in words} for label, words in word_sets.items()}

# Load model and tokenizer
model_name = "jinaai/jina-embeddings-v3"
# Lazy-load guard: reuse the session's model/tokenizer when already present
if 'model' not in locals() or 'tokenizer' not in locals():
    print("Loading model and tokenizer...")
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
    print("Model and tokenizer are already loaded.")
## Model and tokenizer are already loaded.
# Function to get embedding for a word
def get_embedding(word):
    """Return the mean-pooled embedding of `word` as a (1, dim) numpy array.

    Fixed: wrap the forward pass in torch.no_grad(), consistent with the other
    embedding helpers in this file — inference-only work should not build an
    autograd graph.
    """
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Collect embeddings: one per word, tagged with the set (paper) it came from
embeddings = []
words = []
labels = []

for label, words_set in word_sets.items():
    for word in words_set:
        embedding = get_embedding(word)
        embeddings.append(embedding)
        words.append(word)
        labels.append(label)

# Create a DataFrame with words, labels, and embeddings
embedding_df = pd.DataFrame({
    "Word": words,
    "Label": labels,
    "Embedding": [emb[0] for emb in embeddings]  # each embedding is (1, dim); keep the vector
})

# Pivot the DataFrame to have the set labels as columns
pivoted_df = embedding_df.pivot(index="Word", columns="Label", values="Embedding")

# Convert the embedding vectors to strings for display purposes (keep them as
# arrays if you need them for further computation).
# FIX: DataFrame.applymap was deprecated in pandas 2.1 (this cell used to emit
# the FutureWarning); DataFrame.map is the direct elementwise replacement.
pivoted_df = pivoted_df.map(lambda x: str(x.tolist()) if isinstance(x, np.ndarray) else x)
# Display the pivoted DataFrame
print(pivoted_df)
## Label                     Bajta  ...                                              Usman
## Word                             ...                                                   
## absolute                    NaN  ...                                                NaN
## accessibility level         NaN  ...                                                NaN
## accuracy level.value        NaN  ...  [1.221323013305664, -2.9064228534698486, 0.267...
## accuracy measure.not used   NaN  ...  [0.8682874441146851, -2.1167731285095215, 0.68...
## accuracy measure.other      NaN  ...  [1.2351722717285156, -2.535403251647949, 1.183...
## ...                         ...  ...                                                ...
## web objects                 NaN  ...                                                NaN
## web page allocation         NaN  ...                                                NaN
## web page count              NaN  ...                                                NaN
## web software application    NaN  ...                                                NaN
## work team level             NaN  ...                                                NaN
## 
## [346 rows x 6 columns]

#3D PLOT

import plotly.express as px
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
import umap.umap_ as umap
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

plt.clf()  # start from a clean current figure

plt.style.use('seaborn-v0_8-whitegrid')  # You can change this to any available style

plt.rcParams['font.family'] = 'serif'
# Step 2: Flatten the sets into a dataframe (assuming sets is already defined)
# NOTE: words are NOT lower-cased here, unlike the K-means section above.
words = []
labels = []
for label, words_set in sets.items():
    for word in words_set:
        words.append(word)
        labels.append(label)

# Create a dataframe of every word together with the set (paper) it came from
df = pd.DataFrame({'Word': words, 'Set': labels})

# Step 3: Load the pre-trained model and tokenizer
model_name = "jinaai/jina-embeddings-v3"
# Lazy-load guard: reuse the session's model/tokenizer when already present
if 'model' not in locals() or 'tokenizer' not in locals():
    print("Loading model and tokenizer...")
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
    print("Model and tokenizer are already loaded.")
## Model and tokenizer are already loaded.
# Step 4: Get the embeddings for each word
def get_embeddings(word):
    """Tokenize `word`, run the model gradient-free, and mean-pool into a 1-D numpy vector."""
    features = tokenizer(word, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        states = model(**features).last_hidden_state
    pooled = states.mean(dim=1)
    return pooled.squeeze().numpy()

embeddings = np.array([get_embeddings(word) for word in df['Word']])  # one row per word

# Step 5: Perform 3D UMAP (with 3 components)
umap_model = umap.UMAP(n_components=3, random_state=5)
embeddings_3d = umap_model.fit_transform(embeddings)

# Step 6: Convert string labels to numeric labels for coloring
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit_transform(labels)

# Step 7: Create the interactive 3D plot with Plotly
fig = px.scatter_3d(df, x=embeddings_3d[:, 0], y=embeddings_3d[:, 1], z=embeddings_3d[:, 2],
                    color=labels, text=words,
                    labels={'x': 'UMAP Dimension 1', 'y': 'UMAP Dimension 2', 'z': 'UMAP Dimension 3'},
                    title="3D UMAP Visualization of Word Embeddings")

# Customize the layout for better viewing
fig.update_traces(marker=dict(size=5, opacity=0.8), selector=dict(mode='markers+text'))
fig.update_layout(scene=dict(xaxis_title='UMAP Dimension 1',
                             yaxis_title='UMAP Dimension 2',
                             zaxis_title='UMAP Dimension 3'))

# BUG FIX: plt.savefig() saved the *matplotlib* current figure (blank after the
# earlier plt.clf()), not this Plotly figure. Save the interactive figure as
# HTML instead; static PNG export (fig.write_image) would need the extra
# 'kaleido' dependency.
fig.write_html('3d_word_embedding.html')

# Show the interactive plot
fig.show()

Another table showing the common words between papers; it is a bit harder to read. # This is the defining section — it must be executed before the following chunks of code.

import torch
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

plt.clf()  # start from a clean current figure

plt.style.use('seaborn-v0_8-whitegrid')  # You can change this to any available style

plt.rcParams['font.family'] = 'serif'

# Define the sets of words
# NOTE(review): the self-assignment below is a no-op placeholder — swap the
# right-hand side to analyse a different collection of word sets.
sets=sets


# Load the pre-trained model and tokenizer
model_name = "jinaai/jina-embeddings-v3"
# Lazy-load guard: reuse the session's model/tokenizer when already present
if 'model' not in locals() or 'tokenizer' not in locals():
    print("Loading model and tokenizer...")
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
else:
    print("Model and tokenizer are already loaded.")
## Model and tokenizer are already loaded.
# Function to normalize text to lowercase
def normalize_words(words):
    """Return the given words as a set with every entry lower-cased."""
    return set(map(str.lower, words))

# Normalize all words in the sets to lowercase
# (case-insensitive matching; entries differing only by case collapse into one)
normalized_sets = {set_name: normalize_words(word_set) for set_name, word_set in sets.items()}

# Function to get embeddings for a list of words
def get_embeddings(words):
    """Batch-embed `words`; returns a (len(words), dim) tensor of mean-pooled hidden states."""
    batch = tokenizer(list(words), padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        pooled = model(**batch).last_hidden_state.mean(dim=1)  # mean pooling over tokens
    return pooled

# Create a dictionary to store the embeddings of each set
# Keyed by set name; each value is a (num_words, dim) tensor. The row order is
# the iteration order of the Python set, relied on later when labeling pairs.
embeddings = {}
for set_name, word_set in normalized_sets.items():
    embeddings[set_name] = get_embeddings(word_set)

# Create a function to calculate the semantic similarity between sets
def compute_similarity(set1, set2):
    """Pairwise cosine-similarity matrix between the cached embeddings of two sets.

    Reads the module-level `embeddings` dict (set name -> (n_words, dim) tensor);
    row i / column j follow the iteration order of the corresponding word sets.
    """
    return cosine_similarity(embeddings[set1], embeddings[set2])

# Create a similarity matrix for each pair of sets
similarity_results = {}
for set1 in normalized_sets.keys():
    for set2 in normalized_sets.keys():
        if set1 != set2:  # both (A, B) and (B, A) are kept; the matrices are transposes
            sim_matrix = compute_similarity(set1, set2)
            similarity_results[(set1, set2)] = sim_matrix

# Create a simple table to store the similarity values
similarity_table = []

# Populate the table with word pairs and their cosine similarity values.
# NOTE(review): enumerating the sets here assumes the iteration order matches
# the order used when the embeddings were computed — true within one run, since
# a Python set iterates in a stable order, but fragile across refactors.
for (set1, set2), sim_matrix in similarity_results.items():
    for i, word1 in enumerate(normalized_sets[set1]):
        for j, word2 in enumerate(normalized_sets[set2]):
            similarity_table.append({
                "Set 1": set1,
                "Word 1": word1,
                "Set 2": set2,
                "Word 2": word2,
                "Cosine Similarity": sim_matrix[i, j]
            })

# Convert the table to a DataFrame for better display
similarity_df = pd.DataFrame(similarity_table)

# Filter the DataFrame to keep only cosine similarities above 0.9
similarity_df_filtered = similarity_df[similarity_df['Cosine Similarity'] > 0.9]

# Create an empty table (set x set) to store the words that are similar
common_words_table = pd.DataFrame(index=sets.keys(), columns=sets.keys(), dtype=object)

# Populate the table with word pairs whose similarity exceeds the 0.9 threshold
for index, row in similarity_df_filtered.iterrows():
    set1 = row['Set 1']
    word1 = row['Word 1']
    set2 = row['Set 2']
    word2 = row['Word 2']

    # Check if the cell is empty or needs to be updated with word pairs
    if pd.isna(common_words_table.at[set1, set2]):
        common_words_table.at[set1, set2] = f"{word1} - {word2}"
    else:
        common_words_table.at[set1, set2] += f", {word1} - {word2}"

# Display the table showing the common word pairs
print(common_words_table)
##                                                          Bajta  ...                                              Usman
## Bajta                                                      NaN  ...  availability - availability, estimated value -...
## Britto_2017  maintainability level - maintainability, relia...  ...  maintainability level - maintainability, relia...
## Britto_2016  geographic distance - geographical distance, t...  ...                                                NaN
## Dashti       constructive cost model - constructive cost mo...  ...  expert judgment - expert judgement, analogy-ba...
## Mendes                                                     NaN  ...                                                NaN
## Usman        security - security, estimate value(s) - estim...  ...                                                NaN
## 
## [6 rows x 6 columns]

Making a color table

import pandas as pd
from pandas.io.formats.style import Styler

# Step 1: Define the color map for each set
set_colors = {
    "Bajta": "yellow",
    "Britto_2016": "blue",
    "Britto_2017": "green",
    "Dashti": "red",
    "Mendes": "purple",
    "Usman": "orange"
}

# Create the HTML content for the legend
# NOTE(review): legend_html is built but never rendered in this chunk — confirm
# where it is displayed (e.g. IPython.display.HTML).
legend_html = "<div style='font-weight: bold; margin-bottom: 10px;'>Legend:</div>"
for set_name, color in set_colors.items():
    legend_html += f"<div><span style='color:{color};'>●</span> {set_name}</div>"


# Step 2: Create the common words table (we will assume this step has already been completed and filtered)
common_words_table = pd.DataFrame(index=sets.keys(), columns=["Words", "Relations"])

# Step 3: Populate the common words table with colored word pairs
for index, row in similarity_df_filtered.iterrows():
    set1 = row['Set 1']
    word1 = row['Word 1']
    set2 = row['Set 2']
    word2 = row['Word 2']

    # Color the words based on the sets
    # NOTE(review): word1_colored is computed but never used below — the 'Words'
    # column stores the plain word1; confirm whether coloring was intended there.
    word1_colored = f'<span style="color:{set_colors[set1]}">{word1}</span>'
    word2_colored = f'<span style="color:{set_colors[set2]}">{word2}</span>'

    # Find the row corresponding to set1
    current_row = common_words_table.loc[set1]

    if pd.isna(current_row['Words']):
        common_words_table.at[set1, 'Words'] = word1
        common_words_table.at[set1, 'Relations'] = word2_colored
    else:
        common_words_table.at[set1, 'Words'] += f", {word1}"
        common_words_table.at[set1, 'Relations'] += f", {word2_colored}"

# Step 4: Cell-wise coloring function applied by the pandas Styler
def colorize_words(word):
    """Translate a cell holding an HTML ``<span style="color:...">`` snippet
    into a valid CSS declaration for the pandas Styler; other cells get no style.

    Fixed: the previous ``word.split(':')[1]`` produced e.g.
    ``color: yellow">x</span>``, which is not valid CSS — extract only the
    colour token instead.
    """
    import re  # local import keeps this fix self-contained
    if isinstance(word, str) and word.startswith('<span'):
        match = re.search(r'color:\s*([^">;]+)', word)
        if match:
            return f"color: {match.group(1)}"
    return ''

# Step 5: Apply the styling to the DataFrame.
# FIX: Styler.applymap was deprecated in pandas 2.1 (this cell used to emit the
# FutureWarning); Styler.map is the elementwise replacement with identical semantics.
styled_table = common_words_table.style.map(colorize_words, subset=["Words", "Relations"])
# Display the styled table (if using Jupyter or IPython environment)
styled_table